Question 1: Male and Female Graduate Enrollment (1967-2015)

# Exploring the data

enroll_explore <- ggplot(enroll_tidy_df, aes(x = Year, y = Enrollment)) +
  geom_point(aes(color = Sex))

enroll_explore

# Female enrollment looks to be increasing at a much faster rate than male enrollment. Male enrollment was initially higher, but in the late 1980's, female enrollment surpassed male enrollement. Both enrollments level out at 2010. Male enrollment goes through stagnant periods and short spurts of increasing.


# Running a linear regression on dependent variable (y = male enrollment) by independent variale (x = year)

enroll_male_lm1 <- lm(`total Males` ~ Year, data = grad_enroll_df)

summary(enroll_male_lm1)
## 
## Call:
## lm(formula = `total Males` ~ Year, data = grad_enroll_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96461 -34861 -12841  51876  95766 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17112153    1087024  -15.74   <2e-16 ***
## Year             9069        546   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared:  0.8545, Adjusted R-squared:  0.8514 
## F-statistic:   276 on 1 and 47 DF,  p-value: < 2.2e-16
plot(enroll_male_lm1)

# Running a linear regression on dependent variable (y = female enrollment) by independent variable (x = year)

enroll_female_lm1 <- lm(`total Females` ~ Year, data = grad_enroll_df)

summary(enroll_female_lm1)
## 
## Call:
## lm(formula = `total Females` ~ Year, data = grad_enroll_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89397 -48101  -7633  45267 129727 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.896e+07  1.161e+06  -50.77   <2e-16 ***
## Year         3.013e+04  5.832e+02   51.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57730 on 47 degrees of freedom
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9823 
## F-statistic:  2669 on 1 and 47 DF,  p-value: < 2.2e-16
plot(enroll_female_lm1)

# To show the trends graphically, we will plot a graph over time of female and male enrollment to show the differences in trends.

enroll_tidy_df1 <- enroll_tidy_df %>% 
  mutate(enrollment = Enrollment/1000000)

grad_enroll_graph <- ggplot(enroll_tidy_df1, aes(x = Year, y = enrollment)) +
  geom_line(aes(color = Sex), size = 1) +
  theme_linedraw() +
  scale_x_continuous(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0), breaks = seq(0, 2, .25), limits = c(0,2)) +
  labs(x = "Year", y = "Total Graduate Enrollment (Millions of Students)", title ="Graduate Enrollment in the United States (1967-2015)") + scale_color_brewer(palette = "Pastel1")

grad_enroll_graph
<<<<<<< HEAD

=======

>>>>>>> 505f1ec49d937332e8fe90df0e3eb13e1a3f25d2

Question 2: Shifts in Female PhD Recipients by Field (1985,2000,2015)

#Describe if and how there was a shift in PhDs awarded to females in four fields (Physical and Earth Sciences, Engineering, Education, and Humanities & Arts) in 1985, 2000, and 2015. Describe your results statistically, in a graph or table, and in text. 
#Tried to make a dataframe to graph the trends.. this was attempt 1:
phd_female_fields <- phd_field_df %>% 
  select("Field of study and sex", "Gender", "Number 1985", "Percent 1985", "Number 2000", "Percent 2000", "Number 2015", "Percent 2015") %>% 
  filter(Gender == "Female", `Field of study and sex` == "Physical sciences and earth sciences" | `Field of study and sex` == "Engineering" | `Field of study and sex` == "Education" | `Field of study and sex` == "Humanities and arts")
#but realized it was in a terrible format that R couldn't deal with so went back into excel and made a new one
#Attempt 2:

#reassign year to be a character

female_phd_tidy$Year <- as.character(female_phd_tidy$Year)

phd_female_number_graph <- ggplot(female_phd_tidy, aes(x = Year, y = `Number Enrolled`, group = `Field of study and sex`)) +
  geom_col(position = "dodge", aes(fill = `Field of study and sex`)) + 
  theme_classic() + 
  scale_y_continuous(expand = c(0,0)) + 
  scale_fill_brewer(palette = "Pastel1", name = "Field of Study") +
  labs(title = "Number of Female PhD Enrollments by Field (1985-2015)")
  
  
phd_female_number_graph
<<<<<<< HEAD

=======

>>>>>>> 505f1ec49d937332e8fe90df0e3eb13e1a3f25d2
#Does this make sense, or is it better to group by year??? 
#I can't get the colors to change :(
#ok making another one that's based on percents  

phd_female_percent_graph <- ggplot(female_phd_tidy, aes(x = Year, y = `Percent Enrolled`, group = `Field of study and sex`)) +
  geom_col(position = "dodge", aes(fill = `Field of study and sex`)) +
   theme_linedraw() + 
  labs(title = "Percent of Female PhD Enrollments by Field (1985-2015)") +
  scale_fill_brewer(palette = "Pastel1", name = "Field of Study")
   
  
phd_female_percent_graph
<<<<<<< HEAD

=======

>>>>>>> 505f1ec49d937332e8fe90df0e3eb13e1a3f25d2
#proptables and chi square


row.names(female_numbers_tidy) <- female_numbers_tidy$`Field of study and sex`
## Warning: Setting row names on a tibble is deprecated.
female_prop_final <- female_numbers_tidy %>% 
  select(`1985`, `2000`, `2015`) 


phd_chi <- chisq.test(female_prop_final)
phd_chi
## 
##  Pearson's Chi-squared test
## 
## data:  female_prop_final
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
#Ok proportions are significantly different, p < 2.2e-16

Ran phd_chi$stdres in the console to find the significant differences. ALl are significantly different from expected.

                                       1985       2000       2015

Physical sciences and earth sciences -10.376557 -8.559263 17.031216 Engineering -24.746239 -12.702359 33.184446 Education 29.337381 7.576085 -32.127993 Humanities and arts -5.666481 7.947822 -2.866323

Question 3: Male and Female Salaries for Starting PostDoc & Other Employment Positions

# Running paired t-tests for the median salaries of male and females

# Regular employment

# It doesn't work! need to do mann whitney U/wilcox


phd_med_sal_tidy_df
## # A tibble: 60 x 7
##    Field                    Median.Salary Sex    Planas  X5    X6    X7   
##    <chr>                            <dbl> <chr>  <chr>   <chr> <chr> <chr>
##  1 Agricultural sciences a…         78000 FEMALE EMPLOY… <NA>  <NA>  <NA> 
##  2 Biological and biomedic…         75000 FEMALE EMPLOY… <NA>  <NA>  <NA> 
##  3 Health sciences                  75000 FEMALE EMPLOY… <NA>  <NA>  <NA> 
##  4 Chemistry                        80000 FEMALE EMPLOY… <NA>  <NA>  <NA> 
##  5 Geosciences, atmospheri…         75167 FEMALE EMPLOY… <NA>  <NA>  <NA> 
##  6 Physics and astronomy            95000 FEMALE EMPLOY… <NA>  <NA>  <NA> 
##  7 Mathematics and compute…        105000 FEMALE EMPLOY… <NA>  <NA>  <NA> 
##  8 Psychology                       63000 FEMALE EMPLOY… <NA>  <NA>  <NA> 
##  9 Economics                       105000 FEMALE EMPLOY… <NA>  <NA>  <NA> 
## 10 Social sciences                  64000 FEMALE EMPLOY… <NA>  <NA>  <NA> 
## # ... with 50 more rows
# Question 3

# Exploratory Analysis

# Make histogram of median salary vs sex, Employment
employment_test_df <- phd_med_sal_tidy_df %>% 
  filter(Planas == "EMPLOYMENT")

employment_test_hist <- ggplot(employment_test_df, aes(x = Median.Salary )) +
  geom_histogram(bins = 6, aes(fill = Sex)) +
  facet_wrap(~ Sex, scale = "free") +
  theme_classic() +
  theme(legend.position = "none") + 
  scale_y_continuous(expand = c(0,0))

employment_test_hist

# Make histogram of median salary vs sex, Employment
postdoc_test_df <- phd_med_sal_tidy_df %>% 
  filter(Planas == "POSTDOC")

postdoc_test_hist <- ggplot(postdoc_test_df, aes(x = Median.Salary )) +
  geom_histogram(bins = 6, aes(fill = Sex)) +
  facet_wrap(~ Sex, scale = "free") +
  theme_classic() +
  theme(legend.position = "none") + 
  scale_y_continuous(expand = c(0,0)) + 
  scale_fill_brewer(palette = "Pastel1")

postdoc_test_hist

# didn't work

med_emp_male <- c(78000, 75000, 75000, 80000, 75167, 95000, 105000, 63000, 105000, 64000, 95000, 71000, 52000, 123500, 62800)

med_emp_fem <- c(66000, 66000, 75000, 75000, 71750, 97650, 90000, 60000, 95750, 62000, 90000, 63000, 50000, 120000, 61000)

employment_wilcox <- wilcox.test(med_emp_male, med_emp_fem, exact = FALSE, paired = TRUE)

employment_wilcox
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  med_emp_male and med_emp_fem
## V = 101, p-value = 0.002572
## alternative hypothesis: true location shift is not equal to 0
med_post_male <- c(42750, 42000, 43000, 42000, 50000, 50000, 58000, 42000, 65000, 48000, 45000, 50000, 45000, 60000, 50000)

med_post_fem <- c(44000, 42000, 43250, 42000, 50000, 53000, 55000, 42000, 65000, 49250, 45000, 45000, 45000, 63500, 44000)

postdoc_wilcox <- wilcox.test(med_post_male, med_post_fem, exact = FALSE, paired = TRUE)

postdoc_wilcox
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  med_post_male and med_post_fem
## V = 19.5, p-value = 0.8884
## alternative hypothesis: true location shift is not equal to 0
# Column Graphs for Visualization

phd_med_sal_tidy_df2 <- phd_med_sal_tidy_df %>% 
  mutate(sex = ifelse(Sex == "FEMALE", "Male", "Female")) %>% 
  select(Field, Median.Salary, Planas, sex)

phd_med_tidy_employ <- phd_med_sal_tidy_df2 %>% 
  filter(Planas == "EMPLOYMENT")

employ_graph <- ggplot(phd_med_tidy_employ, aes(x = Field, y = Median.Salary)) +
  geom_col(position = "dodge", aes(fill = sex)) +
   theme_linedraw() + 
  coord_flip() +
   scale_y_continuous(expand = c(0,0), breaks = seq(0, 125000, 25000), limits = c(0,125000)) +
  labs(title = "Median Salaries in non-PostDoc Positions", x = "Field", y = "Median Salary ($)") +  scale_fill_brewer(palette = "Pastel1", name = "Sex")


employ_graph
<<<<<<< HEAD

=======

>>>>>>> 505f1ec49d937332e8fe90df0e3eb13e1a3f25d2
phd_med_tidy_post <- phd_med_sal_tidy_df2 %>% 
  filter(Planas == "POSTDOC")

postdoc_graph <- ggplot(phd_med_tidy_post, aes(x = Field, y = Median.Salary)) +
  geom_col(position = "dodge", aes(fill = sex)) +
   theme_linedraw() + 
  coord_flip() + 
  scale_y_continuous(expand = c(0,0), breaks = seq(0, 80000, 20000), limits = c(0,80000)) +
  labs(title = "Median Salaries in PostDoc Positions", x = "Field", y = "Median Salary ($)") + scale_fill_brewer(palette = "Pastel1", name = "Sex")

postdoc_graph
<<<<<<< HEAD

=======

>>>>>>> 505f1ec49d937332e8fe90df0e3eb13e1a3f25d2

Question 4: Exploring Academic Salaries for Professors in U.S. Colleges

# Create Histogram Years Since PhD vs Salary

years_since_phd_gph <- ggplot(fac_sal_df, aes(x = Years.Since.PhD, y = Salary)) +
  geom_point(aes(color = Sex), alpha = 0.5) +
  facet_wrap(~ Faculty.Rank) +
  scale_fill_brewer(palette = "Pastel1")

years_since_phd_gph

#Linear regession exploration 
pairs_df <- fac_sal_df %>% 
  select(Years.Since.PhD, Years.Faculty.Service, Salary)

pairs(pairs_df)

#Salary ad years since PHD looks more linear thean Years.faculty.service

salary_lm <- lm(Salary ~ Sex + Years.Faculty.Service + Years.Since.PhD + Discipline + Faculty.Rank, data = fac_sal_df)
salary_lm
## 
## Call:
## lm(formula = Salary ~ Sex + Years.Faculty.Service + Years.Since.PhD + 
##     Discipline + Faculty.Rank, data = fac_sal_df)
## 
## Coefficients:
##           (Intercept)                SexMale  Years.Faculty.Service  
##               78862.8                 4783.5                 -489.5  
##       Years.Since.PhD            DisciplineB   Faculty.RankAsstProf  
##                 535.1                14417.6               -12907.6  
##      Faculty.RankProf  
##               32158.4
# Set rank reference level 
fac_sal_df$Faculty.Rank <- fct_relevel(fac_sal_df$Faculty.Rank, "AsstProf")

# b. Model summary:

summary(salary_lm)
## 
## Call:
## lm(formula = Salary ~ Sex + Years.Faculty.Service + Years.Since.PhD + 
##     Discipline + Faculty.Rank, data = fac_sal_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65248 -13211  -1775  10384  99592 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            78862.8     4990.3  15.803  < 2e-16 ***
## SexMale                 4783.5     3858.7   1.240  0.21584    
## Years.Faculty.Service   -489.5      211.9  -2.310  0.02143 *  
## Years.Since.PhD          535.1      241.0   2.220  0.02698 *  
## DisciplineB            14417.6     2342.9   6.154 1.88e-09 ***
## Faculty.RankAsstProf  -12907.6     4145.3  -3.114  0.00198 ** 
## Faculty.RankProf       32158.4     3540.6   9.083  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared:  0.4547, Adjusted R-squared:  0.4463 
## F-statistic:  54.2 on 6 and 390 DF,  p-value: < 2.2e-16
AIC(salary_lm)
## [1] 9093.826
# AIC = 9093.826

#Visualize residuals
plot(salary_lm)

vif(salary_lm)
##                           GVIF Df GVIF^(1/(2*Df))
## Sex                   1.030805  1        1.015285
## Years.Faculty.Service 5.923038  1        2.433729
## Years.Since.PhD       7.518936  1        2.742068
## Discipline            1.064105  1        1.031555
## Faculty.Rank          2.013193  2        1.191163
#ok yikes so years of faculty service and years since PhD are super correlated. 
#I feel like basically we only use one of those I feel like that's fine since most people will work right after their phd, doesn't make sense to include both. 


salary_lm2 <- lm(Salary ~ Sex + Years.Since.PhD + Discipline + Faculty.Rank, data = fac_sal_df)
salary_lm2
## 
## Call:
## lm(formula = Salary ~ Sex + Years.Since.PhD + Discipline + Faculty.Rank, 
##     data = fac_sal_df)
## 
## Coefficients:
##           (Intercept)                SexMale        Years.Since.PhD  
##              67884.32                4349.37                  61.01  
##           DisciplineB  Faculty.RankAssocProf       Faculty.RankProf  
##              13937.47               13104.15               46032.55
summary(salary_lm2)
## 
## Call:
## lm(formula = Salary ~ Sex + Years.Since.PhD + Discipline + Faculty.Rank, 
##     data = fac_sal_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67451 -13860  -1549  10716  97023 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           67884.32    4536.89  14.963  < 2e-16 ***
## SexMale                4349.37    3875.39   1.122  0.26242    
## Years.Since.PhD          61.01     127.01   0.480  0.63124    
## DisciplineB           13937.47    2346.53   5.940 6.32e-09 ***
## Faculty.RankAssocProf 13104.15    4167.31   3.145  0.00179 ** 
## Faculty.RankProf      46032.55    4240.12  10.856  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22660 on 391 degrees of freedom
## Multiple R-squared:  0.4472, Adjusted R-squared:  0.4401 
## F-statistic: 63.27 on 5 and 391 DF,  p-value: < 2.2e-16
AIC(salary_lm2)
## [1] 9097.22
#AIC = 9097.22

plot(salary_lm2)

#resdiuals are bad, but the qq plot looks good. 

vif(salary_lm2)
##                     GVIF Df GVIF^(1/(2*Df))
## Sex             1.028359  1        1.014080
## Years.Since.PhD 2.065517  1        1.437191
## Discipline      1.055727  1        1.027486
## Faculty.Rank    1.987205  2        1.187301
#woooo all are under 4 so we are good to go re: colinearlity 


#my prediction is that male/female doesn't change the slope of the line, they just have differne t starting point, so we need that interation term to make up for that? 
#I will graph it to see if it's visual
#ok no that predcition was wrong. Sex doesn't appear to be that significant, there's just way more males than females, and way more old males. 

#For exploratory purposes, what about not including sex? 
salary_lm3 <- lm(Salary ~ Years.Since.PhD + Discipline + Faculty.Rank, data = fac_sal_df)
salary_lm3
## 
## Call:
## lm(formula = Salary ~ Years.Since.PhD + Discipline + Faculty.Rank, 
##     data = fac_sal_df)
## 
## Coefficients:
##           (Intercept)        Years.Since.PhD            DisciplineB  
##              71405.40                  71.92               14028.68  
## Faculty.RankAssocProf       Faculty.RankProf  
##              13030.16               46211.57
AIC(salary_lm3)
## [1] 9096.497
#AIC = 9096.497
plot(salary_lm3)

vif(salary_lm3)
##                     GVIF Df GVIF^(1/(2*Df))
## Years.Since.PhD 2.053425  1        1.432978
## Discipline      1.054461  1        1.026869
## Faculty.Rank    1.978933  2        1.186063
#For exploratory purposes, maybe remove faculty rank, since that goes along with years since PHD?

salary_lm4 <- lm(Salary ~ Years.Since.PhD + Discipline, data = fac_sal_df)
salary_lm4
## 
## Call:
## lm(formula = Salary ~ Years.Since.PhD + Discipline, data = fac_sal_df)
## 
## Coefficients:
##     (Intercept)  Years.Since.PhD      DisciplineB  
##           80158             1119            15784
AIC(salary_lm4)
## [1] 9217.566
#AIC = 9217.566
plot(salary_lm4)

vif(salary_lm4)
## Years.Since.PhD      Discipline 
##        1.049937        1.049937
#For exploratory purposes, maybe remove years, since that goes along with years since Rank? Also putting sex back in. 

salary_lm5 <- lm(Salary ~ Faculty.Rank + Discipline + Sex, data = fac_sal_df)
salary_lm5
## 
## Call:
## lm(formula = Salary ~ Faculty.Rank + Discipline + Sex, data = fac_sal_df)
## 
## Coefficients:
##           (Intercept)  Faculty.RankAssocProf       Faculty.RankProf  
##                 68224                  13723                  47403  
##           DisciplineB                SexMale  
##                 13709                   4492
AIC(salary_lm5)
## [1] 9095.454
#AIC = 9095.454
plot(salary_lm5)

vif(salary_lm5)
##                  GVIF Df GVIF^(1/(2*Df))
## Faculty.Rank 1.034437  2        1.008500
## Discipline   1.012236  1        1.006099
## Sex          1.022339  1        1.011108
gender_dif_graph <- ggplot(fac_sal_df, aes(x = Years.Since.PhD, y = Salary, group = Sex)) + 
  geom_point(aes(color = Faculty.Rank, shape = Discipline)) +
  facet_wrap(~Sex) +
  scale_colour_brewer(palette = "Pastel1") + 
  theme_bw()

gender_dif_graph

regression_table <- stargazer(salary_lm5, type = "html")
Dependent variable:
Salary
Faculty.RankAssocProf 13,723.420***
(3,959.014)
Faculty.RankProf 47,403.320***
(3,133.108)
DisciplineB 13,708.690***
(2,295.437)
SexMale 4,491.801
(3,860.240)
Constant 68,223.530***
(4,477.200)
Observations 397
R2 0.447
Adjusted R2 0.441
Residual Std. Error 22,640.990 (df = 392)
F Statistic 79.180*** (df = 4; 392)
Note: p<0.1; p<0.05; p<0.01

end